# Get data
# NOTE(review): the path is absolute and machine-specific; point it at a
# project-relative location if this script has to run elsewhere.
# stringsAsFactors is set explicitly so text columns are read as factors on
# every R version (the default changed to FALSE in R 4.0); the str() and
# summary() output recorded further down shows these columns as factors.
cs2.data <- read.csv(
  file = "F:/R For Real/DDS-Case-Study-2/CaseStudy2-data.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Columns dropped from the data set before any analysis
exclude_factors <- c("EmployeeCount", "Over18", "StandardHours")
cs2.data <- cs2.data %>% dplyr::select(-all_of(exclude_factors))
# Numeric encoding of the response: 0 when Attrition is "No", 1 otherwise
cs2.data <- cs2.data %>% mutate(AttNum = ifelse(Attrition == "No", 0, 1))
# Split data into sets of different data types: numeric columns plus AttNum
cs2.numeric <- cs2.data %>%
  dplyr::select(
    Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate,
    NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears,
    TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole,
    YearsSinceLastPromotion, YearsWithCurrManager, AttNum
  )
# No apparent NA values; need to check for other NA identifiers
gg_miss_var(cs2.data)
# Number of NA entries in each column. vapply() guarantees exactly one integer
# per column, and sum(is.na(.)) counts the missing values directly.
na_count <- vapply(cs2.data, function(column) sum(is.na(column)), integer(1))
# One-column data frame; the row names are the column names of cs2.data
na_count <- data.frame(na_count)
na_count
## na_count
## ID 0
## Age 0
## Attrition 0
## BusinessTravel 0
## DailyRate 0
## Department 0
## DistanceFromHome 0
## Education 0
## EducationField 0
## EmployeeNumber 0
## EnvironmentSatisfaction 0
## Gender 0
## HourlyRate 0
## JobInvolvement 0
## JobLevel 0
## JobRole 0
## JobSatisfaction 0
## MaritalStatus 0
## MonthlyIncome 0
## MonthlyRate 0
## NumCompaniesWorked 0
## OverTime 0
## PercentSalaryHike 0
## PerformanceRating 0
## RelationshipSatisfaction 0
## StockOptionLevel 0
## TotalWorkingYears 0
## TrainingTimesLastYear 0
## WorkLifeBalance 0
## YearsAtCompany 0
## YearsInCurrentRole 0
## YearsSinceLastPromotion 0
## YearsWithCurrManager 0
## AttNum 0
# Print the structure of cs2.data: the type and first few values of each column
str(cs2.data)
## 'data.frame': 870 obs. of 34 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
## $ AttNum : num 0 0 0 0 0 0 0 0 0 0 ...
# Print per-column summaries of cs2.data (quantiles/mean for numeric columns,
# level counts for factor columns)
summary(cs2.data)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 No :730 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 Yes:140 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EmployeeNumber EnvironmentSatisfaction Gender
## Human Resources : 15 Min. : 1.0 Min. :1.000 Female:354
## Life Sciences :358 1st Qu.: 477.2 1st Qu.:2.000 Male :516
## Marketing :100 Median :1039.0 Median :3.000
## Medical :270 Mean :1029.8 Mean :2.701
## Other : 52 3rd Qu.:1561.5 3rd Qu.:4.000
## Technical Degree: 75 Max. :2064.0 Max. :4.000
##
## HourlyRate JobInvolvement JobLevel
## Min. : 30.00 Min. :1.000 Min. :1.000
## 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Sales Executive :200 Min. :1.000 Divorced:191 Min. : 1081
## Research Scientist :172 1st Qu.:2.000 Married :410 1st Qu.: 2840
## Laboratory Technician :153 Median :3.000 Single :269 Median : 4946
## Manufacturing Director : 87 Mean :2.709 Mean : 6390
## Healthcare Representative: 76 3rd Qu.:4.000 3rd Qu.: 8182
## Sales Representative : 53 Max. :4.000 Max. :19999
## (Other) :129
## MonthlyRate NumCompaniesWorked OverTime PercentSalaryHike
## Min. : 2094 Min. :0.000 No :618 Min. :11.0
## 1st Qu.: 8092 1st Qu.:1.000 Yes:252 1st Qu.:12.0
## Median :14074 Median :2.000 Median :14.0
## Mean :14326 Mean :2.728 Mean :15.2
## 3rd Qu.:20456 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :26997 Max. :9.000 Max. :25.0
##
## PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## Min. :3.000 Min. :1.000 Min. :0.0000 Min. : 0.00
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :3.000 Median :1.0000 Median :10.00
## Mean :3.152 Mean :2.707 Mean :0.7839 Mean :11.05
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :4.000 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.832 Mean :2.782 Mean : 6.962 Mean : 4.205
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager AttNum
## Min. : 0.000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.: 2.00 1st Qu.:0.0000
## Median : 1.000 Median : 3.00 Median :0.0000
## Mean : 2.169 Mean : 4.14 Mean :0.1609
## 3rd Qu.: 3.000 3rd Qu.: 7.00 3rd Qu.:0.0000
## Max. :15.000 Max. :17.00 Max. :1.0000
##
# Take a look at collinearity: scatterplot matrix of the columns in cs2.numeric
pairs(cs2.numeric)
# Nothing looks so highly correlated that we should want to get rid of it.
# Pairwise correlations of the cs2.numeric columns, rounded to 3 decimals
corr <- round(cor(cs2.numeric), 3)
# Lower-triangle correlation plot with the coefficients printed in each square
ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "square",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlations of Selected Continuous Variables"
)
# Adding in the response
# Scatterplot of every pair of numeric predictors, coloured by Attrition.
# The pairs are generated from the columns of cs2.numeric in their column order
# (AttNum, the encoded response, is left out), so one plot is printed per
# unordered pair instead of one hand-written ggplot() call per pair.
#
# Analyst notes recorded against individual plots:
#   Age vs DailyRate:             attrition occurs younger, invariant of rate
#   Age vs DistanceFromHome:      seems most workers live within 10 miles - needs
#                                 some follow up on what percentages; distance
#                                 from home is invariant with age
#   Age vs HourlyRate:            attrition seems to occur younger no matter the
#                                 rate
#   Age vs MonthlyIncome:         possible further investigation
#   Age vs MonthlyRate:           eh
#   Age vs NumCompaniesWorked:    further investigation
#   Age vs PercentSalaryHike:     further investigation
#   Age vs TotalWorkingYears:     further investigation
#   Age vs TrainingTimesLastYear: eh
#   Age vs YearsAtCompany:        eh
#   Age vs YearsWithCurrManager:  pretty interesting how the zero line is quite
#                                 telling; looks like a majority of attrition
#                                 happens within 1 year with a manager
#   YearsAtCompany vs the other Years* variables: positively correlated with the
#                                 other time-measuring variables, no surprise

# Build one scatterplot of cs2.data.
#   x_var, y_var: column names (character scalars) for the x and y axes.
# Returns the ggplot object; the axis titles are set to the column names.
plot_attrition_scatter <- function(x_var, y_var) {
  ggplot(
    cs2.data,
    aes(x = .data[[x_var]], y = .data[[y_var]], color = Attrition)
  ) +
    geom_point() +
    labs(x = x_var, y = y_var)
}

numeric_predictors <- setdiff(names(cs2.numeric), "AttNum")
for (var_pair in combn(numeric_predictors, 2, simplify = FALSE)) {
  # Explicit print(): ggplot objects are not auto-printed inside a loop
  print(plot_attrition_scatter(var_pair[1], var_pair[2]))
}

# Histogram of YearsWithCurrManager (one-year bins), one panel per Attrition
# level, with the y axis formatted as a percentage. Originally printed right
# after the Age vs YearsWithCurrManager scatterplot.
cs2.data %>%
  ggplot(aes(x = YearsWithCurrManager, fill = Attrition)) +
  geom_histogram(binwidth = 1) +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~Attrition)
# Histogram of YearsWithCurrManager for employees who left (Attrition == "Yes").
# The visible x range is set with coord_cartesian() rather than scale limits:
# with binwidth = 1 the bar centred on 0 spans -0.5 to 0.5, so scale limits of
# c(0, 20) put its left edge out of bounds and the bar is removed from the plot.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsWithCurrManager)) +
  geom_histogram(binwidth = 1, fill = "#00BFC4") +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent Attrition") +
  xlab("Years With Current Manager") +
  ggtitle("Distribution of Attrition by Years with Current Manager") +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  scale_x_continuous(breaks = sort(c(seq(0, 20, 5), 3)), expand = c(0, 0)) +
  coord_cartesian(xlim = c(-0.5, 20.5))
# Of those who leave 40% do so before the end of the first year, 60% by year 3, and by year 7 90%
# Empirical cumulative distribution of YearsWithCurrManager among leavers, with
# dashed reference lines at 2 and 7 years
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsWithCurrManager)) +
  stat_ecdf() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(breaks = seq(1, 15, 1)) +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  geom_vline(xintercept = 7, color = "darkred", size = 1.5, linetype = "dashed") +
  ggtitle("Cumulative Proportion of Attrition by Years with Current Manager") +
  ylab("Cumulative Proportion of Attrition") +
  xlab("Years With Current Manager")
# Histogram of YearsInCurrentRole for employees who left (Attrition == "Yes").
# The visible x range is set with coord_cartesian() rather than scale limits:
# with binwidth = 1 the bar centred on 0 spans -0.5 to 0.5, so scale limits of
# c(0, 20) put its left edge out of bounds and the bar is removed from the plot.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsInCurrentRole)) +
  geom_histogram(binwidth = 1, fill = "#00BFC4") +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent Attrition") +
  xlab("Years in Current Role") +
  ggtitle("Distribution of Attrition by Years in Current Role") +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  scale_x_continuous(breaks = sort(c(seq(0, 20, 5), 3)), expand = c(0, 0)) +
  coord_cartesian(xlim = c(-0.5, 20.5))
# Of those who leave 40% do so before the end of the first year, 60% by year 3, and by year 7 90%
# NOTE(review): the sentence above is word-for-word the note attached to the
# years-with-current-manager plots; confirm the percentages for this variable.
# Empirical cumulative distribution of YearsInCurrentRole among leavers, with
# dashed reference lines at 2 and 7 years
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsInCurrentRole)) +
  stat_ecdf() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(breaks = seq(1, 15, 1)) +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  geom_vline(xintercept = 7, color = "darkred", size = 1.5, linetype = "dashed") +
  ggtitle("Cumulative Proportion of Attrition by Years In Current Role") +
  ylab("Cumulative Proportion of Attrition") +
  xlab("Years In Current Role")
# Histogram of YearsAtCompany for employees who left (Attrition == "Yes").
# The visible x range is set with coord_cartesian() rather than scale limits.
# Scale limits of c(0, 20) censor values above 20 before the bins are counted
# and also remove the bar centred on 0 (it spans -0.5 to 0.5); zooming keeps
# every leaver in the percentages and only restricts what is displayed.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsAtCompany)) +
  geom_histogram(binwidth = 1, fill = "#00BFC4") +
  aes(y = stat(count) / sum(stat(count))) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent Attrition") +
  xlab("Years at Company") +
  ggtitle("Distribution of Attrition by Years at Company") +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  scale_x_continuous(breaks = sort(c(seq(0, 20, 5), 3)), expand = c(0, 0)) +
  coord_cartesian(xlim = c(-0.5, 20.5))
# Of those who leave 40% do so before the end of the first year, 60% by year 3, and by year 7 90%
# NOTE(review): the sentence above is word-for-word the note attached to the
# years-with-current-manager plots, while the second reference line here is at
# 10 years rather than 7; confirm the percentages for this variable.
# Empirical cumulative distribution of YearsAtCompany among leavers. The view
# is limited to 0-20 years with coord_cartesian() so that leavers with longer
# tenure still count towards the cumulative proportion.
cs2.data %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = YearsAtCompany)) +
  stat_ecdf() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(breaks = seq(1, 20, 1)) +
  coord_cartesian(xlim = c(0, 20)) +
  geom_vline(xintercept = 2, color = "red", size = 1.5, linetype = "dashed") +
  geom_vline(xintercept = 10, color = "darkred", size = 1.5, linetype = "dashed") +
  ggtitle("Cumulative Proportion of Attrition by Years At Company") +
  ylab("Cumulative Proportion of Attrition") +
  xlab("Years At Company")
## Job fulfillment and Marital Status
# Work-life balance by marital status: no change between the groups
g1 <- ggplot(cs2.data, aes(x = MaritalStatus, y = WorkLifeBalance, fill = MaritalStatus)) +
  geom_boxplot() +
  scale_fill_manual(values = c("firebrick1", "darkorchid2", "deepskyblue")) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Monthly income by marital status: singles generally make slightly less money
# than the other two groups, likely age dependent
g2 <- ggplot(cs2.data, aes(x = MaritalStatus, y = MonthlyIncome, fill = MaritalStatus)) +
  geom_boxplot() +
  scale_fill_manual(values = c("firebrick1", "darkorchid2", "deepskyblue")) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Job satisfaction by marital status: divorced employees report lower job
# satisfaction, though their median satisfaction is unchanged
g3 <- ggplot(cs2.data, aes(x = MaritalStatus, y = JobSatisfaction, fill = MaritalStatus)) +
  geom_boxplot() +
  scale_fill_manual(values = c("firebrick1", "darkorchid2", "deepskyblue")) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
# Bold, left-aligned title drawn as its own grid cell
g11 <- ggdraw() +
  draw_label("Marital Status and Job Fulfillment", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))
# 2 x 5 grid: title in the first row, the three boxplots in the second row with
# empty (NULL) spacer cells between them
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)
## Job fulfillment and Gender
# NOTE(review): the notes originally attached to these three plots were copied
# verbatim from the marital-status section (they referred to single and
# divorced employees); re-check the plots and record gender-specific findings.
# Work-life balance by gender
g1 <- ggplot(cs2.data, aes(x = Gender, y = WorkLifeBalance, fill = Gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("pink", "dodgerblue1")) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Monthly income by gender
g2 <- ggplot(cs2.data, aes(x = Gender, y = MonthlyIncome, fill = Gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("pink", "dodgerblue1")) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Job satisfaction by gender
g3 <- ggplot(cs2.data, aes(x = Gender, y = JobSatisfaction, fill = Gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("pink", "dodgerblue1")) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Bold, left-aligned title drawn as its own grid cell
g11 <- ggdraw() +
  draw_label("Gender and Job Fulfillment", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))
# 2 x 5 grid: title in the first row, the three boxplots in the second row with
# empty (NULL) spacer cells between them
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)
## Job fulfillment and Job Level
# NOTE(review): the notes originally attached to these three plots were copied
# verbatim from the marital-status section (they referred to single and
# divorced employees); re-check the plots and record job-level findings.
# JobLevel is also mapped to `group` so that one box is drawn per level.
# Work-life balance by job level
g1 <- ggplot(cs2.data, aes(x = JobLevel, y = WorkLifeBalance, fill = JobLevel, group = JobLevel)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Monthly income by job level
g2 <- ggplot(cs2.data, aes(x = JobLevel, y = MonthlyIncome, fill = JobLevel, group = JobLevel)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Job satisfaction by job level
g3 <- ggplot(cs2.data, aes(x = JobLevel, y = JobSatisfaction, fill = JobLevel, group = JobLevel)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Bold, left-aligned title drawn as its own grid cell
g11 <- ggdraw() +
  draw_label("Job Level and Job Fulfillment", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))
# 2 x 5 grid: title in the first row, the three boxplots in the second row with
# empty (NULL) spacer cells between them
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)
## Job fulfillment and Business Travel
# NOTE(review): the notes originally attached to these three plots were copied
# verbatim from the marital-status section (they referred to single and
# divorced employees); re-check the plots and record travel-specific findings.
# Work-life balance by business travel
g1 <- ggplot(cs2.data, aes(x = BusinessTravel, y = WorkLifeBalance, fill = BusinessTravel)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Monthly income by business travel
g2 <- ggplot(cs2.data, aes(x = BusinessTravel, y = MonthlyIncome, fill = BusinessTravel)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Job satisfaction by business travel
g3 <- ggplot(cs2.data, aes(x = BusinessTravel, y = JobSatisfaction, fill = BusinessTravel)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Yep singles are generally younger...
# (age by marital status; printed before the business-travel grid below)
ggplot(cs2.data, aes(x = MaritalStatus, y = Age, fill = MaritalStatus)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Bold, left-aligned title drawn as its own grid cell
g11 <- ggdraw() +
  draw_label("Business Travel and Job Fulfillment", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))
# 2 x 5 grid: title in the first row, the three boxplots in the second row with
# empty (NULL) spacer cells between them
plot_grid(
  g11, NULL, NULL, NULL, NULL,
  g1, NULL, g2, NULL, g3,
  ncol = 5, nrow = 2,
  rel_widths = c(2, 0.5, 2, 0.5, 2),
  rel_heights = c(0.5, 4)
)
# Here we can see that overtime and education field affect attrition. Those who
# work overtime are much more likely to leave.
# Most severe in marketing - Human resources seems to have a lot no matter what
#
# Each pipeline below counts employees per (OverTime, <category>, Attrition),
# then converts the counts to shares within each (OverTime, <category>) pair.
# .groups = "drop_last" states that grouping explicitly (it is what the share
# calculation in mutate() relies on) and avoids the summarise() regrouping
# message. The stacked columns are labelled with the rounded share.
cs2.data %>%
  group_by(OverTime, EducationField, Attrition) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = EducationField, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(
    aes(label = round(Perc, 3)),
    position = position_stack(vjust = 0.5)
  ) +
  ggtitle("Education Field by Overtime and Attrition")
cs2.data %>%
  group_by(OverTime, JobLevel, Attrition) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = JobLevel, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(
    aes(label = round(Perc, 3)),
    position = position_stack(vjust = 0.5)
  ) +
  ggtitle("Job Level by Overtime and Attrition")
cs2.data %>%
  group_by(OverTime, MaritalStatus, Attrition) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = MaritalStatus, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(
    aes(label = round(Perc, 3)),
    position = position_stack(vjust = 0.5)
  ) +
  ggtitle("Marital Status by Overtime and Attrition")
# Smaller label text here than in the three plots above
cs2.data %>%
  group_by(OverTime, JobRole, Attrition) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(Perc = count / sum(count)) %>%
  ggplot(aes(x = JobRole, y = Perc, fill = Attrition, group = Attrition)) +
  geom_col() +
  facet_wrap(~OverTime) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(
    aes(label = round(Perc, 3)),
    position = position_stack(vjust = 0.5),
    size = 2.5
  ) +
  ggtitle("Job Role by Overtime and Attrition")
# Disabled: plots of the no-information variables. EmployeeCount and Over18 are
# listed in exclude_factors and removed from cs2.data, so these plots cannot be
# built from it. Every line is commented out, including the continuation of the
# g1 pipeline; left uncommented, that line runs on its own and prints a bare
# theme object.
#g1 = cs2.data %>% group_by(EmployeeCount) %>% summarise(count = n()) %>% ggplot(aes(x = EmployeeCount, y = count)) + geom_boxplot() +
#theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank()) + ggtitle("Employee Count is Constant at 870")
#g2 = cs2.data %>%
#group_by(Over18) %>% summarise(count = n()) %>%
#ggplot(aes(x = Over18, y = count, fill = Over18)) + geom_bar(stat = "identity", width = 0.5) +
#theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none", axis.title.y = element_blank()) +
#geom_text(aes(label = count), vjust = -0.25) +
#ggtitle("All Employees are Over 18") + scale_fill_manual(values = c("steelblue"))
#g11 = ggdraw() + draw_label("No Information Variables", fontface = 'bold', x = 0, hjust = 0) + theme(plot.margin = margin(0,0,0,7))
#plot_grid(g11, NULL, g1,g2,cols = 2, rows = 2, rel_heights = c(0.25,2))
# Class balance check: share of all employees falling in each Attrition level,
# shown as one bar per level with the rounded share printed above it.
ggplot(
  data = cs2.data %>%
    group_by(Attrition) %>%
    summarize(count = n()) %>%
    mutate(Perc = count / sum(count)),
  mapping = aes(x = Attrition, y = Perc, fill = Attrition)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(Perc, digits = 3)), vjust = -0.25) +
  labs(title = "Attrition Balance in Data set")
## `summarise()` ungrouping output (override with `.groups` argument)
# DistanceFromHome boxplots split by Attrition, one plot per categorical variable.

# By job role (tick labels rotated and shrunk, x-axis title hidden)
ggplot(cs2.data, aes(x = JobRole, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    axis.title.x = element_blank()
  ) +
  labs(title = "Job Role by Distance from Home and Attrition")

# By overtime status
ggplot(cs2.data, aes(x = OverTime, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot()

# By marital status
ggplot(cs2.data, aes(x = MaritalStatus, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot()

# By business travel frequency
ggplot(cs2.data, aes(x = BusinessTravel, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot()
# Possibly Important Continuous Variables
# One boxplot per variable, split by Attrition. Panel legends and x-axis titles are
# hidden; only the last panel (g7) keeps its legend.
g1 <- ggplot(cs2.data, aes(x = Attrition, y = YearsWithCurrManager, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  ylab("YrsWManager")
g2 <- ggplot(cs2.data, aes(x = Attrition, y = YearsAtCompany, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  ylab("YrsAtCo.")
g3 <- ggplot(cs2.data, aes(x = Attrition, y = YearsInCurrentRole, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  ylab("YrsInRole")
g4 <- ggplot(cs2.data, aes(x = Attrition, y = Age, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank())
g5 <- ggplot(cs2.data, aes(x = Attrition, y = MonthlyIncome, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank())
g6 <- ggplot(cs2.data, aes(x = Attrition, y = TotalWorkingYears, fill = Attrition)) +
  geom_boxplot() +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  ylab("WorkingYears")
g7 <- ggplot(cs2.data, aes(x = Attrition, y = DistanceFromHome, fill = Attrition)) +
  geom_boxplot() +
  theme(axis.title.x = element_blank())
# Heading drawn in the first cell of the grid.
g11 <- ggdraw() +
  draw_label("Largest Spread Continuous Variables", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))
# 4 x 3 layout: heading row, two rows of three panels, then g7 in the middle of
# the last row. nrow/ncol replace the deprecated rows/cols arguments.
plot_grid(
  g11, NULL, NULL,
  g1, g2, g3,
  g4, g5, g6,
  NULL, g7, NULL,
  nrow = 4, ncol = 3, rel_heights = c(0.5, 2, 2, 2)
)
# Build a stacked bar chart of the Attrition share within each level of `x_var`.
#   data       - data frame containing `x_var` and an Attrition column
#   x_var      - unquoted column name to place on the x axis
#   label_size - optional font size for the x tick labels (NULL keeps the theme default)
# Returns a ggplot object with the legend and y-axis title hidden.
plot_attrition_share <- function(data, x_var, label_size = NULL) {
  data %>%
    group_by({{ x_var }}, Attrition) %>%
    # Keep the grouping by x_var so Perc is the share within each x level.
    summarise(count = n(), .groups = "drop_last") %>%
    mutate(Perc = count / sum(count)) %>%
    ungroup() %>%
    ggplot(aes(x = {{ x_var }}, y = Perc, fill = Attrition)) +
    geom_col() +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.65, hjust = 1, size = label_size),
      legend.position = "none",
      axis.title.y = element_blank()
    )
}

# One panel per categorical variable; JobRole uses a smaller tick-label size.
g1 <- plot_attrition_share(cs2.data, JobInvolvement)
g2 <- plot_attrition_share(cs2.data, JobLevel)
g3 <- plot_attrition_share(cs2.data, JobRole, label_size = 8)
g4 <- plot_attrition_share(cs2.data, MaritalStatus)
g5 <- plot_attrition_share(cs2.data, NumCompaniesWorked)
g6 <- plot_attrition_share(cs2.data, OverTime)
g7 <- plot_attrition_share(cs2.data, StockOptionLevel)
g8 <- plot_attrition_share(cs2.data, WorkLifeBalance)

# Heading drawn in the first cell of the grid.
g11 <- ggdraw() +
  draw_label("Largest Difference Categorical Variables", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 7))
# 4 x 4 layout: heading row, four panels, an all-NULL spacer row, four panels.
# nrow/ncol replace the deprecated rows/cols arguments.
plot_grid(
  g11, NULL, NULL, NULL,
  g1, g2, g3, g4,
  NULL, NULL, NULL, NULL,
  g5, g6, g7, g8,
  ncol = 4, nrow = 4, rel_heights = c(0.3, 2, 0.5, 2)
)